{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Credits: https://github.com/bansalkanav/Machine_Learning_and_Deep_Learning"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Handling Missing Values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## How to identify missing values?\n",
    "<code>df.isnull()</code> - It returns True if values are missing\n",
    "\n",
    "\n",
    "## How to deal with missing values?\n",
    "\n",
    "- Drop the missing values.\n",
    "    * either drop the entire row\n",
    "    <br><code> df.dropna('col_name', axis = 0, inplace = True)</code>\n",
    "    * or drop the entire column\n",
    "    <br><code> df.dropna('col_name', axis = 1, inplace = True)</code>\n",
    "- Replace the missing value \n",
    "    * Use business understanding\n",
    "    * Statistical methods - Imputation (Mean, Median, Mode)\n",
    "    <br><code> df['col_name'].fillna(df.col_name.mean(), inplace = True)</code>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>FirstName</th>\n",
       "      <th>LastName</th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>preTestScore</th>\n",
       "      <th>postTestScore</th>\n",
       "      <th>location</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>abc</td>\n",
       "      <td>mno</td>\n",
       "      <td>12.0</td>\n",
       "      <td>m</td>\n",
       "      <td>90.0</td>\n",
       "      <td>65.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>90.0</td>\n",
       "      <td>?</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ghi</td>\n",
       "      <td>pqr</td>\n",
       "      <td>12.0</td>\n",
       "      <td>f</td>\n",
       "      <td>-</td>\n",
       "      <td>65.0</td>\n",
       "      <td>?</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>jkl</td>\n",
       "      <td>stu</td>\n",
       "      <td>12.0</td>\n",
       "      <td>f</td>\n",
       "      <td>90.0</td>\n",
       "      <td>62.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>mno</td>\n",
       "      <td>vwx</td>\n",
       "      <td>12.0</td>\n",
       "      <td>m</td>\n",
       "      <td>89.0</td>\n",
       "      <td>63.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  FirstName LastName   Age  Sex preTestScore postTestScore location\n",
       "0       abc      mno  12.0    m         90.0          65.0      NaN\n",
       "1       NaN      NaN   NaN  NaN         90.0             ?      NaN\n",
       "2       ghi      pqr  12.0    f            -          65.0        ?\n",
       "3       jkl      stu  12.0    f         90.0          62.0      NaN\n",
       "4       mno      vwx  12.0    m         89.0          63.0      NaN"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('data/data.csv')\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(5, 7)\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 5 entries, 0 to 4\n",
      "Data columns (total 7 columns):\n",
      " #   Column         Non-Null Count  Dtype  \n",
      "---  ------         --------------  -----  \n",
      " 0   FirstName      4 non-null      object \n",
      " 1   LastName       4 non-null      object \n",
      " 2   Age            4 non-null      float64\n",
      " 3   Sex            4 non-null      object \n",
      " 4   preTestScore   5 non-null      object \n",
      " 5   postTestScore  5 non-null      object \n",
      " 6   location       1 non-null      object \n",
      "dtypes: float64(1), object(6)\n",
      "memory usage: 408.0+ bytes\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "print(df.shape)\n",
    "\n",
    "print(df.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>FirstName</th>\n",
       "      <th>LastName</th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>preTestScore</th>\n",
       "      <th>postTestScore</th>\n",
       "      <th>location</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>abc</td>\n",
       "      <td>mno</td>\n",
       "      <td>12.0</td>\n",
       "      <td>m</td>\n",
       "      <td>90.0</td>\n",
       "      <td>65.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>90.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ghi</td>\n",
       "      <td>pqr</td>\n",
       "      <td>12.0</td>\n",
       "      <td>f</td>\n",
       "      <td>NaN</td>\n",
       "      <td>65.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>jkl</td>\n",
       "      <td>stu</td>\n",
       "      <td>12.0</td>\n",
       "      <td>f</td>\n",
       "      <td>90.0</td>\n",
       "      <td>62.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>mno</td>\n",
       "      <td>vwx</td>\n",
       "      <td>12.0</td>\n",
       "      <td>m</td>\n",
       "      <td>89.0</td>\n",
       "      <td>63.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  FirstName LastName   Age  Sex  preTestScore  postTestScore  location\n",
       "0       abc      mno  12.0    m          90.0           65.0       NaN\n",
       "1       NaN      NaN   NaN  NaN          90.0            NaN       NaN\n",
       "2       ghi      pqr  12.0    f           NaN           65.0       NaN\n",
       "3       jkl      stu  12.0    f          90.0           62.0       NaN\n",
       "4       mno      vwx  12.0    m          89.0           63.0       NaN"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "missing_val = ['n/a', '-', '?']\n",
    "\n",
    "df = pd.read_csv('data/data.csv', na_values = missing_val)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 5 entries, 0 to 4\n",
      "Data columns (total 7 columns):\n",
      " #   Column         Non-Null Count  Dtype  \n",
      "---  ------         --------------  -----  \n",
      " 0   FirstName      4 non-null      object \n",
      " 1   LastName       4 non-null      object \n",
      " 2   Age            4 non-null      float64\n",
      " 3   Sex            4 non-null      object \n",
      " 4   preTestScore   4 non-null      float64\n",
      " 5   postTestScore  4 non-null      float64\n",
      " 6   location       0 non-null      float64\n",
      "dtypes: float64(4), object(3)\n",
      "memory usage: 408.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## How to identify missing values?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>FirstName</th>\n",
       "      <th>LastName</th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>preTestScore</th>\n",
       "      <th>postTestScore</th>\n",
       "      <th>location</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>abc</td>\n",
       "      <td>mno</td>\n",
       "      <td>12.0</td>\n",
       "      <td>m</td>\n",
       "      <td>90.0</td>\n",
       "      <td>65.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>90.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ghi</td>\n",
       "      <td>pqr</td>\n",
       "      <td>12.0</td>\n",
       "      <td>f</td>\n",
       "      <td>NaN</td>\n",
       "      <td>65.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>jkl</td>\n",
       "      <td>stu</td>\n",
       "      <td>12.0</td>\n",
       "      <td>f</td>\n",
       "      <td>90.0</td>\n",
       "      <td>62.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>mno</td>\n",
       "      <td>vwx</td>\n",
       "      <td>12.0</td>\n",
       "      <td>m</td>\n",
       "      <td>89.0</td>\n",
       "      <td>63.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  FirstName LastName   Age  Sex  preTestScore  postTestScore  location\n",
       "0       abc      mno  12.0    m          90.0           65.0       NaN\n",
       "1       NaN      NaN   NaN  NaN          90.0            NaN       NaN\n",
       "2       ghi      pqr  12.0    f           NaN           65.0       NaN\n",
       "3       jkl      stu  12.0    f          90.0           62.0       NaN\n",
       "4       mno      vwx  12.0    m          89.0           63.0       NaN"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>FirstName</th>\n",
       "      <th>LastName</th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>preTestScore</th>\n",
       "      <th>postTestScore</th>\n",
       "      <th>location</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   FirstName  LastName    Age    Sex  preTestScore  postTestScore  location\n",
       "0      False     False  False  False         False          False      True\n",
       "1       True      True   True   True         False           True      True\n",
       "2      False     False  False  False          True          False      True\n",
       "3      False     False  False  False         False          False      True\n",
       "4      False     False  False  False         False          False      True"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "FirstName        1\n",
       "LastName         1\n",
       "Age              1\n",
       "Sex              1\n",
       "preTestScore     1\n",
       "postTestScore    1\n",
       "location         5\n",
       "dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Identifying missing values in columns\n",
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "FirstName        True\n",
       "LastName         True\n",
       "Age              True\n",
       "Sex              True\n",
       "preTestScore     True\n",
       "postTestScore    True\n",
       "location         True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Columns with atleast one missing value\n",
    "df.isnull().any(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "FirstName        False\n",
       "LastName         False\n",
       "Age              False\n",
       "Sex              False\n",
       "preTestScore     False\n",
       "postTestScore    False\n",
       "location          True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Columns with all missing values\n",
    "df.isnull().all(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of columns with all missing values\n",
    "df.isnull().all(axis=0).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    True\n",
       "1    True\n",
       "2    True\n",
       "3    True\n",
       "4    True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows with atleast one missing values\n",
    "df.isnull().any(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1    False\n",
       "2    False\n",
       "3    False\n",
       "4    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Rows with all missing values\n",
    "df.isnull().all(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows with all missing values\n",
    "df.isnull().all(axis=1).sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Missing values Treatment in Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "FirstName         20.0\n",
       "LastName          20.0\n",
       "Age               20.0\n",
       "Sex               20.0\n",
       "preTestScore      20.0\n",
       "postTestScore     20.0\n",
       "location         100.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "round(100*(df.isnull().sum()/len(df.index)), 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "FirstName        20.0\n",
       "LastName         20.0\n",
       "Age              20.0\n",
       "Sex              20.0\n",
       "preTestScore     20.0\n",
       "postTestScore    20.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# removing the location Column\n",
    "df.dropna(axis = 1, how='all', inplace = True)\n",
    "\n",
    "round(100*(df.isnull().sum()/len(df.index)), 2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Missing values Treatment in Rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>FirstName</th>\n",
       "      <th>LastName</th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>preTestScore</th>\n",
       "      <th>postTestScore</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>90.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  FirstName LastName  Age  Sex  preTestScore  postTestScore\n",
       "1       NaN      NaN  NaN  NaN          90.0            NaN"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df.isnull().sum(axis=1) >= 4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "FirstName         0.0\n",
       "LastName          0.0\n",
       "Age               0.0\n",
       "Sex               0.0\n",
       "preTestScore     25.0\n",
       "postTestScore     0.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# retaining the rows having <= 4 NaNs\n",
    "df = df[df.isnull().sum(axis=1) <= 4]\n",
    "\n",
    "# look at the summary again\n",
    "round(100*(df.isnull().sum()/len(df.index)), 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>FirstName</th>\n",
       "      <th>LastName</th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>preTestScore</th>\n",
       "      <th>postTestScore</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>abc</td>\n",
       "      <td>mno</td>\n",
       "      <td>12.0</td>\n",
       "      <td>m</td>\n",
       "      <td>90.0</td>\n",
       "      <td>65.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ghi</td>\n",
       "      <td>pqr</td>\n",
       "      <td>12.0</td>\n",
       "      <td>f</td>\n",
       "      <td>NaN</td>\n",
       "      <td>65.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>jkl</td>\n",
       "      <td>stu</td>\n",
       "      <td>12.0</td>\n",
       "      <td>f</td>\n",
       "      <td>90.0</td>\n",
       "      <td>62.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>mno</td>\n",
       "      <td>vwx</td>\n",
       "      <td>12.0</td>\n",
       "      <td>m</td>\n",
       "      <td>89.0</td>\n",
       "      <td>63.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  FirstName LastName   Age Sex  preTestScore  postTestScore\n",
       "0       abc      mno  12.0   m          90.0           65.0\n",
       "2       ghi      pqr  12.0   f           NaN           65.0\n",
       "3       jkl      stu  12.0   f          90.0           62.0\n",
       "4       mno      vwx  12.0   m          89.0           63.0"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count     3.000000\n",
       "mean     89.666667\n",
       "std       0.577350\n",
       "min      89.000000\n",
       "25%      89.500000\n",
       "50%      90.000000\n",
       "75%      90.000000\n",
       "max      90.000000\n",
       "Name: preTestScore, dtype: float64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['preTestScore'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "FirstName        0.0\n",
       "LastName         0.0\n",
       "Age              0.0\n",
       "Sex              0.0\n",
       "preTestScore     0.0\n",
       "postTestScore    0.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# imputing preTestScore by mean values\n",
    "df['preTestScore'].fillna(df['preTestScore'].mean(), inplace=True)\n",
    "\n",
    "round(100*(df.isnull().sum()/len(df.index)), 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>FirstName</th>\n",
       "      <th>LastName</th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>preTestScore</th>\n",
       "      <th>postTestScore</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>abc</td>\n",
       "      <td>mno</td>\n",
       "      <td>12.0</td>\n",
       "      <td>m</td>\n",
       "      <td>90.000000</td>\n",
       "      <td>65.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ghi</td>\n",
       "      <td>pqr</td>\n",
       "      <td>12.0</td>\n",
       "      <td>f</td>\n",
       "      <td>89.666667</td>\n",
       "      <td>65.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>jkl</td>\n",
       "      <td>stu</td>\n",
       "      <td>12.0</td>\n",
       "      <td>f</td>\n",
       "      <td>90.000000</td>\n",
       "      <td>62.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>mno</td>\n",
       "      <td>vwx</td>\n",
       "      <td>12.0</td>\n",
       "      <td>m</td>\n",
       "      <td>89.000000</td>\n",
       "      <td>63.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  FirstName LastName   Age Sex  preTestScore  postTestScore\n",
       "0       abc      mno  12.0   m     90.000000           65.0\n",
       "2       ghi      pqr  12.0   f     89.666667           65.0\n",
       "3       jkl      stu  12.0   f     90.000000           62.0\n",
       "4       mno      vwx  12.0   m     89.000000           63.0"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.19999999999999996"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# fraction of rows lost\n",
    "1 - len(df.index)/5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Missing Value Imputation using KNNImputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>FirstName</th>\n",
       "      <th>LastName</th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>preTestScore</th>\n",
       "      <th>postTestScore</th>\n",
       "      <th>location</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>abc</td>\n",
       "      <td>mno</td>\n",
       "      <td>12.0</td>\n",
       "      <td>m</td>\n",
       "      <td>90.0</td>\n",
       "      <td>65.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>90.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ghi</td>\n",
       "      <td>pqr</td>\n",
       "      <td>12.0</td>\n",
       "      <td>f</td>\n",
       "      <td>NaN</td>\n",
       "      <td>65.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>jkl</td>\n",
       "      <td>stu</td>\n",
       "      <td>12.0</td>\n",
       "      <td>f</td>\n",
       "      <td>90.0</td>\n",
       "      <td>62.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>mno</td>\n",
       "      <td>vwx</td>\n",
       "      <td>12.0</td>\n",
       "      <td>m</td>\n",
       "      <td>89.0</td>\n",
       "      <td>63.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  FirstName LastName   Age  Sex  preTestScore  postTestScore  location\n",
       "0       abc      mno  12.0    m          90.0           65.0       NaN\n",
       "1       NaN      NaN   NaN  NaN          90.0            NaN       NaN\n",
       "2       ghi      pqr  12.0    f           NaN           65.0       NaN\n",
       "3       jkl      stu  12.0    f          90.0           62.0       NaN\n",
       "4       mno      vwx  12.0    m          89.0           63.0       NaN"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "missing_val = ['n/a', '-', '?']\n",
    "\n",
    "df = pd.read_csv('data/data.csv', na_values = missing_val)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "FirstName        1\n",
       "LastName         1\n",
       "Age              1\n",
       "Sex              1\n",
       "preTestScore     1\n",
       "postTestScore    1\n",
       "location         5\n",
       "dtype: int64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Identifying missing values in columns\n",
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 5 entries, 0 to 4\n",
      "Data columns (total 7 columns):\n",
      " #   Column         Non-Null Count  Dtype  \n",
      "---  ------         --------------  -----  \n",
      " 0   FirstName      4 non-null      object \n",
      " 1   LastName       4 non-null      object \n",
      " 2   Age            4 non-null      float64\n",
      " 3   Sex            4 non-null      object \n",
      " 4   preTestScore   4 non-null      float64\n",
      " 5   postTestScore  4 non-null      float64\n",
      " 6   location       0 non-null      float64\n",
      "dtypes: float64(4), object(3)\n",
      "memory usage: 408.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>preTestScore</th>\n",
       "      <th>postTestScore</th>\n",
       "      <th>location</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12.0</td>\n",
       "      <td>90.0</td>\n",
       "      <td>65.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>90.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>65.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12.0</td>\n",
       "      <td>90.0</td>\n",
       "      <td>62.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>63.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age  preTestScore  postTestScore  location\n",
       "0  12.0          90.0           65.0       NaN\n",
       "1   NaN          90.0            NaN       NaN\n",
       "2  12.0           NaN           65.0       NaN\n",
       "3  12.0          90.0           62.0       NaN\n",
       "4  12.0          89.0           63.0       NaN"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_num = df.select_dtypes(include=['int64', 'float64'])\n",
    "\n",
    "df_num.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>preTestScore</th>\n",
       "      <th>postTestScore</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12.0</td>\n",
       "      <td>90.0</td>\n",
       "      <td>65.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>90.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>65.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12.0</td>\n",
       "      <td>90.0</td>\n",
       "      <td>62.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>63.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age  preTestScore  postTestScore\n",
       "0  12.0          90.0           65.0\n",
       "1   NaN          90.0            NaN\n",
       "2  12.0           NaN           65.0\n",
       "3  12.0          90.0           62.0\n",
       "4  12.0          89.0           63.0"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_num = df_num.drop(['location'], axis=1)\n",
    "\n",
    "df_num.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>preTestScore</th>\n",
       "      <th>postTestScore</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12.0</td>\n",
       "      <td>90.000000</td>\n",
       "      <td>65.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12.0</td>\n",
       "      <td>90.000000</td>\n",
       "      <td>63.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12.0</td>\n",
       "      <td>89.666667</td>\n",
       "      <td>65.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12.0</td>\n",
       "      <td>90.000000</td>\n",
       "      <td>62.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12.0</td>\n",
       "      <td>89.000000</td>\n",
       "      <td>63.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age  preTestScore  postTestScore\n",
       "0  12.0     90.000000      65.000000\n",
       "1  12.0     90.000000      63.333333\n",
       "2  12.0     89.666667      65.000000\n",
       "3  12.0     90.000000      62.000000\n",
       "4  12.0     89.000000      63.000000"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.impute import KNNImputer\n",
    "\n",
    "knn_imputer = KNNImputer(n_neighbors = 3)\n",
    "\n",
    "df_knn_imp = pd.DataFrame(knn_imputer.fit_transform(df_num),\n",
    "                         columns=df_num.columns,\n",
    "                         index=df_num.index)\n",
    "\n",
    "df_knn_imp.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>preTestScore</th>\n",
       "      <th>postTestScore</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12.0</td>\n",
       "      <td>90.00</td>\n",
       "      <td>65.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12.0</td>\n",
       "      <td>90.00</td>\n",
       "      <td>63.75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12.0</td>\n",
       "      <td>89.75</td>\n",
       "      <td>65.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12.0</td>\n",
       "      <td>90.00</td>\n",
       "      <td>62.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12.0</td>\n",
       "      <td>89.00</td>\n",
       "      <td>63.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age  preTestScore  postTestScore\n",
       "0  12.0         90.00          65.00\n",
       "1  12.0         90.00          63.75\n",
       "2  12.0         89.75          65.00\n",
       "3  12.0         90.00          62.00\n",
       "4  12.0         89.00          63.00"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.impute import SimpleImputer\n",
    "\n",
    "mean_imputer = SimpleImputer(strategy='mean')\n",
    "# strategy = 'mean', 'median', 'most_frequent'\n",
    "\n",
    "df_mean_imp = pd.DataFrame(mean_imputer.fit_transform(df_num),\n",
    "                         columns=df_num.columns,\n",
    "                         index=df_num.index)\n",
    "\n",
    "df_mean_imp.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
